## Loading required package: ggplot2
## Loading required package: grid
## Loading required package: gridExtra
## Loading required package: reshape2
## Loading required package: ROCR
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
## 
## Loading required package: plyr
## Loading required package: stringr

Part of the idea is that for the Laplace noising to work we have to plug in a sigma (level of noising). We simulate having a very good methodology to do so by supplying dCal a large calibration set to pick sigma. In practice you don’t have such a set and would need to either know sigma from first principles or experience, or use some of your training data to build it. What we want to demonstrate is the effectiveness of the differential privacy inspired Laplace nosing technique, so we will give it a good sigma (which one may or may not have in actual practice).

print('naive effects model')
## [1] "naive effects model"
bCoder <- trainEffectCoderR(dTrain,yName,vars,0)
dTrainB <- bCoder$codeFrameR(dTrain)
dTestB <- bCoder$codeFrameR(dTest)
varsB <- setdiff(colnames(dTrainB),yVars)
formulaB <- paste(yName,paste(varsB,collapse=' + '),sep=' ~ ')
modelB <- lm(formulaB,data=dTrainB)
print(summary(modelB))
## 
## Call:
## lm(formula = formulaB, data = dTrainB)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.36368 -0.38461  0.00547  0.37974  1.76628 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.08056    0.01285   6.270 4.44e-10 ***
## x1           0.26619    0.02110  12.617  < 2e-16 ***
## x2           0.25160    0.03405   7.389 2.18e-13 ***
## x3           0.19060    0.05277   3.612 0.000312 ***
## x4           0.27750    0.02483  11.178  < 2e-16 ***
## x5           0.27266    0.02451  11.122  < 2e-16 ***
## x6           0.21281    0.03761   5.658 1.75e-08 ***
## x7           0.25934    0.02207  11.752  < 2e-16 ***
## x8           0.28974    0.02194  13.208  < 2e-16 ***
## x9           0.24599    0.02020  12.175  < 2e-16 ***
## x10          0.26001    0.01797  14.465  < 2e-16 ***
## n1           0.09067    0.01400   6.475 1.19e-10 ***
## n2           0.09275    0.01416   6.552 7.26e-11 ***
## n3           0.11523    0.01392   8.277 2.31e-16 ***
## n4           0.08853    0.01403   6.308 3.48e-10 ***
## n5           0.07794    0.01441   5.410 7.09e-08 ***
## n6           0.09353    0.01446   6.468 1.25e-10 ***
## n7           0.08617    0.01443   5.974 2.74e-09 ***
## n8           0.10136    0.01409   7.196 8.80e-13 ***
## n9           0.09799    0.01410   6.949 5.00e-12 ***
## n10          0.10318    0.01430   7.214 7.70e-13 ***
## n11          0.11272    0.01402   8.041 1.53e-15 ***
## n12          0.08253    0.01430   5.773 9.04e-09 ***
## n13          0.10629    0.01451   7.324 3.51e-13 ***
## n14          0.10574    0.01409   7.502 9.46e-14 ***
## n15          0.11166    0.01427   7.827 8.13e-15 ***
## n16          0.08986    0.01379   6.518 9.01e-11 ***
## n17          0.08851    0.01406   6.295 3.78e-10 ***
## n18          0.09905    0.01404   7.056 2.36e-12 ***
## n19          0.09338    0.01476   6.328 3.07e-10 ***
## n20          0.11530    0.01387   8.311  < 2e-16 ***
## n21          0.09641    0.01424   6.769 1.71e-11 ***
## n22          0.10090    0.01397   7.225 7.15e-13 ***
## n23          0.08629    0.01374   6.282 4.10e-10 ***
## n24          0.10409    0.01408   7.392 2.12e-13 ***
## n25          0.08755    0.01444   6.063 1.59e-09 ***
## n26          0.09146    0.01401   6.528 8.46e-11 ***
## n27          0.10086    0.01433   7.038 2.69e-12 ***
## n28          0.10048    0.01373   7.316 3.72e-13 ***
## n29          0.09177    0.01455   6.305 3.55e-10 ***
## n30          0.08877    0.01417   6.265 4.57e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5746 on 1959 degrees of freedom
## Multiple R-squared:  0.927,  Adjusted R-squared:  0.9255 
## F-statistic: 622.2 on 40 and 1959 DF,  p-value: < 2.2e-16
dTrainB$pred <- predict(modelB,newdata=dTrainB)
print(paste('train rmse',rmse(dTrainB$pred,dTrainB[[yName]])))
## [1] "train rmse 0.568682746837468"
print(WVPlots::ScatterHist(dTrainB,'pred',yName,
                           'naive effects model train',
                           smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## Warning: Removed 5 rows containing missing values (geom_smooth).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).

## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name                grob
## 1 1 (2-2,1-1) arrange      gtable[layout]
## 2 2 (2-2,2-2) arrange      gtable[layout]
## 3 3 (3-3,1-1) arrange      gtable[layout]
## 4 4 (3-3,2-2) arrange      gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.140]
dTestB$pred <- predict(modelB,newdata=dTestB)
print(paste('test rmse',rmse(dTestB$pred,dTestB[[yName]])))
## [1] "test rmse 1.80270755169949"
print(WVPlots::ScatterHist(dTestB,'pred',yName,
                           'naive effects model test',
                           smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).

## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name                grob
## 1 1 (2-2,1-1) arrange      gtable[layout]
## 2 2 (2-2,2-2) arrange      gtable[layout]
## 3 3 (3-3,1-1) arrange      gtable[layout]
## 4 4 (3-3,2-2) arrange      gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.293]
# try re-fitting pred to see if it is just a shift/scale issue
# it is unlikely to be such, but good to look
f2 <- paste(yName,'pred',sep=' ~ ')
m2 <- lm(f2,data=dTestB)  # dileberately fitting on test itself (to show it does not help)
print(summary(m2))
## 
## Call:
## lm(formula = f2, data = dTestB)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.6308 -1.2125 -0.0242  1.1666  6.6364 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.08973    0.01789  -5.016 5.37e-07 ***
## pred         1.35606    0.02195  61.781  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.779 on 9998 degrees of freedom
## Multiple R-squared:  0.2763, Adjusted R-squared:  0.2762 
## F-statistic:  3817 on 1 and 9998 DF,  p-value: < 2.2e-16
dTestB$pred2 <- predict(m2,newdata=dTestB)
print(paste('adjusted test rmse',rmse(dTestB$pred2,dTestB[[yName]])))
## [1] "adjusted test rmse 1.77849696252789"
print(WVPlots::ScatterHist(dTestB,'pred2',yName,
                           'naive effects model test (adjusted on test)',
                           smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).

## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name                grob
## 1 1 (2-2,1-1) arrange      gtable[layout]
## 2 2 (2-2,2-2) arrange      gtable[layout]
## 3 3 (3-3,1-1) arrange      gtable[layout]
## 4 4 (3-3,2-2) arrange      gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.446]
print(paste('effects model, sigma=',bSigmaBest))
## [1] "effects model, sigma= 18"
bCoder <- trainEffectCoderR(dTrain,yName,vars,bSigmaBest)
dTrainB <- bCoder$codeFrameR(dTrain)
dTestB <- bCoder$codeFrameR(dTest)
varsB <- setdiff(colnames(dTrainB),yVars)
formulaB <- paste(yName,paste(varsB,collapse=' + '),sep=' ~ ')
modelB <- lm(formulaB,data=dTrainB)
print(summary(modelB))
## 
## Call:
## lm(formula = formulaB, data = dTrainB)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3381 -0.7288 -0.0028  0.7294  4.1872 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.1144731  0.0254224   4.503 7.10e-06 ***
## x1          0.8196186  0.0366025  22.392  < 2e-16 ***
## x2          0.8107393  0.0613603  13.213  < 2e-16 ***
## x3          0.5370312  0.0734711   7.309 3.89e-13 ***
## x4          0.6955862  0.0347961  19.990  < 2e-16 ***
## x5          0.9399467  0.0462294  20.332  < 2e-16 ***
## x6          0.8020531  0.0666998  12.025  < 2e-16 ***
## x7          0.8330830  0.0357499  23.303  < 2e-16 ***
## x8          0.9735313  0.0356524  27.306  < 2e-16 ***
## x9          0.9739952  0.0370505  26.288  < 2e-16 ***
## x10         0.9026969  0.0307460  29.360  < 2e-16 ***
## n1          0.0007364  0.0014324   0.514 0.607233    
## n2          0.0013764  0.0015041   0.915 0.360253    
## n3          0.0042308  0.0016007   2.643 0.008280 ** 
## n4          0.0014212  0.0014834   0.958 0.338151    
## n5          0.0022101  0.0014995   1.474 0.140667    
## n6          0.0001683  0.0015206   0.111 0.911892    
## n7          0.0024897  0.0014427   1.726 0.084548 .  
## n8          0.0031708  0.0014498   2.187 0.028852 *  
## n9          0.0020965  0.0013381   1.567 0.117336    
## n10         0.0014853  0.0015310   0.970 0.332073    
## n11         0.0021847  0.0014032   1.557 0.119648    
## n12         0.0011659  0.0014552   0.801 0.423118    
## n13         0.0018880  0.0012354   1.528 0.126630    
## n14         0.0042721  0.0013614   3.138 0.001726 ** 
## n15         0.0028255  0.0015850   1.783 0.074805 .  
## n16         0.0052796  0.0013157   4.013 6.22e-05 ***
## n17         0.0040144  0.0013349   3.007 0.002670 ** 
## n18         0.0029941  0.0014040   2.132 0.033092 *  
## n19         0.0032835  0.0014587   2.251 0.024497 *  
## n20         0.0015390  0.0014401   1.069 0.285347    
## n21         0.0046751  0.0013568   3.446 0.000582 ***
## n22         0.0033080  0.0014532   2.276 0.022931 *  
## n23         0.0014003  0.0012182   1.149 0.250506    
## n24         0.0028375  0.0016629   1.706 0.088100 .  
## n25         0.0015065  0.0014213   1.060 0.289319    
## n26         0.0043705  0.0016020   2.728 0.006425 ** 
## n27         0.0052489  0.0015203   3.453 0.000567 ***
## n28         0.0045239  0.0016781   2.696 0.007082 ** 
## n29         0.0028452  0.0016432   1.732 0.083516 .  
## n30         0.0037814  0.0014427   2.621 0.008835 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.08 on 1959 degrees of freedom
## Multiple R-squared:  0.7421, Adjusted R-squared:  0.7369 
## F-statistic:   141 on 40 and 1959 DF,  p-value: < 2.2e-16
dTrainB$pred <- predict(modelB,newdata=dTrainB)
print(paste('train rmse',rmse(dTrainB$pred,dTrainB[[yName]])))
## [1] "train rmse 1.06902793009366"
print(WVPlots::ScatterHist(dTrainB,'pred',yName,
                           paste('effects model train, sigma=',bSigmaBest),
                           smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).

## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name                grob
## 1 1 (2-2,1-1) arrange      gtable[layout]
## 2 2 (2-2,2-2) arrange      gtable[layout]
## 3 3 (3-3,1-1) arrange      gtable[layout]
## 4 4 (3-3,2-2) arrange      gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.635]
dTestB$pred <- predict(modelB,newdata=dTestB)
print(paste('test rmse',rmse(dTestB$pred,dTestB[[yName]])))
## [1] "test rmse 1.17820921023282"
print(WVPlots::ScatterHist(dTestB,'pred',yName,
                           paste('effects model test, sigma=',bSigmaBest),
                           smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).

## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name                grob
## 1 1 (2-2,1-1) arrange      gtable[layout]
## 2 2 (2-2,2-2) arrange      gtable[layout]
## 3 3 (3-3,1-1) arrange      gtable[layout]
## 4 4 (3-3,2-2) arrange      gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.788]
print('effects model, jacknifed')
## [1] "effects model, jacknifed"
bCoder <- trainEffectCoderR(dTrain,yName,vars,0)
# dTrainB <- bCoder$codeFrame(dTrain)
# dTrainB <- bCoder$codeFrame(dCal)
dTrainB <- jackknifeEffectCodeR(dTrain,yName,vars)
dTestB <- bCoder$codeFrameR(dTest)
varsB <- setdiff(colnames(dTrainB),yVars)
formulaB <- paste(yName,paste(varsB,collapse=' + '),sep=' ~ ')
modelB <- lm(formulaB,data=dTrainB)
print(summary(modelB))
## 
## Call:
## lm(formula = formulaB, data = dTrainB)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0503 -0.7460  0.0110  0.6973  3.9378 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.080580   0.024499   3.289  0.00102 ** 
## x1           0.878376   0.036480  24.078  < 2e-16 ***
## x2           0.891187   0.062232  14.320  < 2e-16 ***
## x3           0.769654   0.098820   7.788 1.09e-14 ***
## x4           0.944314   0.043679  21.619  < 2e-16 ***
## x5           0.945152   0.042897  22.033  < 2e-16 ***
## x6           0.849552   0.069355  12.249  < 2e-16 ***
## x7           0.949746   0.037648  25.227  < 2e-16 ***
## x8           1.043654   0.036410  28.664  < 2e-16 ***
## x9           0.962349   0.033361  28.847  < 2e-16 ***
## x10          0.925969   0.029086  31.836  < 2e-16 ***
## n1           0.027136   0.020443   1.327  0.18454    
## n2          -0.009259   0.020976  -0.441  0.65896    
## n3           0.031421   0.020508   1.532  0.12565    
## n4          -0.030794   0.020589  -1.496  0.13491    
## n5          -0.020427   0.020843  -0.980  0.32720    
## n6           0.009093   0.021514   0.423  0.67261    
## n7           0.013300   0.021150   0.629  0.52953    
## n8           0.008724   0.020401   0.428  0.66897    
## n9          -0.001165   0.020370  -0.057  0.95440    
## n10         -0.001888   0.021453  -0.088  0.92990    
## n11          0.012744   0.020469   0.623  0.53364    
## n12         -0.001472   0.020607  -0.071  0.94306    
## n13         -0.034807   0.020765  -1.676  0.09385 .  
## n14          0.019618   0.020409   0.961  0.33655    
## n15          0.027026   0.020765   1.302  0.19323    
## n16         -0.002004   0.019604  -0.102  0.91859    
## n17          0.003144   0.020437   0.154  0.87777    
## n18          0.008286   0.020588   0.402  0.68738    
## n19         -0.040983   0.021189  -1.934  0.05324 .  
## n20         -0.012028   0.020187  -0.596  0.55136    
## n21         -0.024144   0.021062  -1.146  0.25179    
## n22          0.016472   0.020233   0.814  0.41567    
## n23         -0.012645   0.020121  -0.628  0.52976    
## n24          0.020160   0.021181   0.952  0.34132    
## n25         -0.049179   0.020551  -2.393  0.01680 *  
## n26          0.021604   0.020720   1.043  0.29723    
## n27         -0.022748   0.021124  -1.077  0.28166    
## n28         -0.008041   0.020318  -0.396  0.69231    
## n29         -0.036017   0.021344  -1.687  0.09167 .  
## n30          0.009637   0.020581   0.468  0.63968    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.095 on 1959 degrees of freedom
## Multiple R-squared:  0.7349, Adjusted R-squared:  0.7295 
## F-statistic: 135.8 on 40 and 1959 DF,  p-value: < 2.2e-16
dTrainB$pred <- predict(modelB,newdata=dTrainB)
print(paste('train rmse',rmse(dTrainB$pred,dTrainB[[yName]])))
## [1] "train rmse 1.08397286675349"
print(WVPlots::ScatterHist(dTrainB,'pred',yName,
                       'effects model train, jackknifed',
                         smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).

## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name                grob
## 1 1 (2-2,1-1) arrange      gtable[layout]
## 2 2 (2-2,2-2) arrange      gtable[layout]
## 3 3 (3-3,1-1) arrange      gtable[layout]
## 4 4 (3-3,2-2) arrange      gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.965]
dTestB$pred <- predict(modelB,newdata=dTestB)
print(paste('test rmse',rmse(dTestB$pred,dTestB[[yName]])))
## [1] "test rmse 1.0755272623194"
print(WVPlots::ScatterHist(dTestB,'pred',yName,
                       'effects model test, jackknifed',
                         smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).

## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name                 grob
## 1 1 (2-2,1-1) arrange       gtable[layout]
## 2 2 (2-2,2-2) arrange       gtable[layout]
## 3 3 (3-3,1-1) arrange       gtable[layout]
## 4 4 (3-3,2-2) arrange       gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1118]
print("vtreat split model")
## [1] "vtreat split model"
pruneSig = 0.05

print("working vtreat split model")
## [1] "working vtreat split model"
mTitle <- 'vtreat split model'
isTrain <- runif(nrow(dTrain))<=0.5
dTrainDT <- dTrain[isTrain,]
dTrainDC <- dTrain[!isTrain,]
treatments <- vtreat::designTreatmentsN(dTrainDC,vars,yName,
                                        rareSig=0.3,
                                        smFactor=5.0,
                                        minFraction=2.0,
                                        verbose=FALSE,
                                        parallelCluster=cl)
dTrainV <- vtreat::prepare(treatments,dTrainDT,pruneSig=pruneSig,
                           parallelCluster=cl)

#print(treatments$scoreFrame)
varsV <- intersect(colnames(dTrainV),
                   treatments$scoreFrame$varName[treatments$scoreFrame$sig<pruneSig])
print(varsV)
##  [1] "x1_catP"  "x1_catN"  "x2_catN"  "x4_catN"  "x4_catD"  "x5_catN" 
##  [7] "x5_catD"  "x6_catN"  "x7_catN"  "x7_catD"  "x8_catN"  "x9_catN" 
## [13] "x9_catD"  "x10_catP" "x10_catN" "x10_catD" "n20_catN"
dTestV <- vtreat::prepare(treatments,dTest,pruneSig=pruneSig,
                          varRestriction=varsV,
                          parallelCluster=cl)
formulaV <- paste(yName,paste(varsV,collapse=' + '),sep=' ~ ')
modelV <- lm(formulaV,data=dTrainV)
dTrainV$pred <- predict(modelV,newdata=dTrainV)
print(paste('train rmse',rmse(dTrainV$pred,dTrainV[[yName]])))
## [1] "train rmse 1.14302240429833"
print(WVPlots::ScatterHist(dTrainV,'pred',yName,
                         paste(mTitle,'train'),
                         smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).

## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name                 grob
## 1 1 (2-2,1-1) arrange       gtable[layout]
## 2 2 (2-2,2-2) arrange       gtable[layout]
## 3 3 (3-3,1-1) arrange       gtable[layout]
## 4 4 (3-3,2-2) arrange       gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1295]
dTestV$pred <- predict(modelV,newdata=dTestV)
print(paste('test rmse',rmse(dTestV$pred,dTestV[[yName]])))
## [1] "test rmse 1.14827374668532"
print(WVPlots::ScatterHist(dTestV,'pred',yName,
                         paste(mTitle,'test'),
                         smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).

## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name                 grob
## 1 1 (2-2,1-1) arrange       gtable[layout]
## 2 2 (2-2,2-2) arrange       gtable[layout]
## 3 3 (3-3,1-1) arrange       gtable[layout]
## 4 4 (3-3,2-2) arrange       gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1448]
print("vtreat cross model")
## [1] "vtreat cross model"
pruneSig = 0.05
if("mkCrossFrameNExperiment" %in% ls('package:vtreat')) {
  print("working vtreat cross model")
  mTitle <- 'vtreat cross model'
  crossD <- vtreat::mkCrossFrameNExperiment(dTrain,vars,yName,
                                            rareSig=0.3,
                                            smFactor=5.0,
                                            minFraction=2.0,
                                            parallelCluster=cl)
  treatments <- crossD$treatments
  dTrainV <- crossD$crossFrame
  
#  print(treatments$scoreFrame)
  varsV <- intersect(colnames(dTrainV),
                     treatments$scoreFrame$varName[treatments$scoreFrame$sig<pruneSig])
  print(varsV)
  dTestV <- vtreat::prepare(treatments,dTest,pruneSig=pruneSig,
                            varRestriction=varsV,
                            parallelCluster=cl)
  formulaV <- paste(yName,paste(varsV,collapse=' + '),sep=' ~ ')
  modelV <- lm(formulaV,data=dTrainV)
  print(summary(modelV))
  dTrainV$pred <- predict(modelV,newdata=dTrainV)
  print(paste('train rmse',rmse(dTrainV$pred,dTrainV[[yName]])))
  print(WVPlots::ScatterHist(dTrainV,'pred',yName,
                             paste(mTitle,'train'),
                             smoothmethod='lm',annot_size=2))
  dTestV$pred <- predict(modelV,newdata=dTestV)
  print(paste('test rmse',rmse(dTestV$pred,dTestV[[yName]])))
  print(WVPlots::ScatterHist(dTestV,'pred',yName,
                             paste(mTitle,'test'),
                             smoothmethod='lm',annot_size=2))
} else {
  print("cross model not in library")
}
## [1] "working vtreat cross model"
##  [1] "x1_catN"  "x2_catN"  "x2_catD"  "x3_catN"  "x4_catP"  "x4_catN" 
##  [7] "x5_catN"  "x6_catP"  "x6_catN"  "x7_catN"  "x7_catD"  "x8_catN" 
## [13] "x8_catD"  "x9_catN"  "x10_catP" "x10_catN" "n3_catD" 
## 
## Call:
## lm(formula = formulaV, data = dTrainV)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2362 -0.7503 -0.0042  0.7292  3.8354 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.08983    0.89523  -2.334 0.019674 *  
## x1_catN      0.90032    0.03766  23.905  < 2e-16 ***
## x2_catN      0.90069    0.06542  13.767  < 2e-16 ***
## x2_catD      0.46364    0.20138   2.302 0.021419 *  
## x3_catN      0.83154    0.09666   8.603  < 2e-16 ***
## x4_catP     -5.25998    2.72649  -1.929 0.053847 .  
## x4_catN      0.96990    0.04712  20.584  < 2e-16 ***
## x5_catN      0.96030    0.04440  21.630  < 2e-16 ***
## x6_catP      8.08470    3.03272   2.666 0.007742 ** 
## x6_catN      0.98357    0.07655  12.848  < 2e-16 ***
## x7_catN      0.92333    0.03931  23.491  < 2e-16 ***
## x7_catD      0.64209    0.18859   3.405 0.000676 ***
## x8_catN      1.02752    0.03826  26.853  < 2e-16 ***
## x8_catD     -0.40701    0.22186  -1.834 0.066732 .  
## x9_catN      0.98006    0.03461  28.317  < 2e-16 ***
## x10_catP     5.15991    4.07040   1.268 0.205066    
## x10_catN     0.94034    0.03306  28.446  < 2e-16 ***
## n3_catD     -0.02438    0.01618  -1.507 0.132033    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.107 on 1982 degrees of freedom
## Multiple R-squared:  0.7262, Adjusted R-squared:  0.7239 
## F-statistic: 309.3 on 17 and 1982 DF,  p-value: < 2.2e-16
## 
## [1] "train rmse 1.101539755982"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name                 grob
## 1 1 (2-2,1-1) arrange       gtable[layout]
## 2 2 (2-2,2-2) arrange       gtable[layout]
## 3 3 (3-3,1-1) arrange       gtable[layout]
## 4 4 (3-3,2-2) arrange       gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1625]
## [1] "test rmse 1.05281991330366"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).

## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name                 grob
## 1 1 (2-2,1-1) arrange       gtable[layout]
## 2 2 (2-2,2-2) arrange       gtable[layout]
## 3 3 (3-3,1-1) arrange       gtable[layout]
## 4 4 (3-3,2-2) arrange       gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1778]
sSigma = 100
print(paste('effects model, smoothing=',sSigma))
## [1] "effects model, smoothing= 100"
bCoder <- trainEffectCoderLSmooth(dTrain,yName,vars,sSigma)
dTrainB <- bCoder$codeFrameR(dTrain)
dTestB <- bCoder$codeFrameR(dTest)
varsB <- setdiff(colnames(dTrainB),yVars)
formulaB <- paste(yName,paste(varsB,collapse=' + '),sep=' ~ ')
modelB <- lm(formulaB,data=dTrainB)
print(summary(modelB))
## 
## Call:
## lm(formula = formulaB, data = dTrainB)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0123 -0.4071  0.0243  0.4038  2.0010 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.03157    0.01411   2.238   0.0253 *  
## x1           0.46900    0.03433  13.661  < 2e-16 ***
## x2           0.43635    0.05605   7.785 1.12e-14 ***
## x3           0.35879    0.08629   4.158 3.35e-05 ***
## x4           0.48251    0.04054  11.902  < 2e-16 ***
## x5           0.49399    0.03994  12.369  < 2e-16 ***
## x6           0.36647    0.06119   5.989 2.50e-09 ***
## x7           0.45372    0.03566  12.725  < 2e-16 ***
## x8           0.51743    0.03583  14.443  < 2e-16 ***
## x9           0.44220    0.03236  13.667  < 2e-16 ***
## x10          0.46038    0.02866  16.066  < 2e-16 ***
## n1           2.50847    0.35889   6.990 3.77e-12 ***
## n2           2.48761    0.36780   6.764 1.77e-11 ***
## n3           3.15687    0.35059   9.004  < 2e-16 ***
## n4           2.56514    0.36398   7.048 2.51e-12 ***
## n5           1.88568    0.36896   5.111 3.52e-07 ***
## n6           2.19744    0.37914   5.796 7.90e-09 ***
## n7           2.08384    0.35166   5.926 3.66e-09 ***
## n8           2.36660    0.34844   6.792 1.46e-11 ***
## n9           2.46809    0.35716   6.910 6.51e-12 ***
## n10          2.20245    0.35444   6.214 6.30e-10 ***
## n11          2.68048    0.34446   7.782 1.15e-14 ***
## n12          1.99004    0.37429   5.317 1.18e-07 ***
## n13          2.61391    0.37424   6.984 3.90e-12 ***
## n14          2.43482    0.35298   6.898 7.09e-12 ***
## n15          2.57561    0.36112   7.132 1.38e-12 ***
## n16          2.43798    0.34197   7.129 1.41e-12 ***
## n17          2.25436    0.34132   6.605 5.11e-11 ***
## n18          2.52469    0.35400   7.132 1.39e-12 ***
## n19          2.50887    0.37768   6.643 3.97e-11 ***
## n20          2.24130    0.34401   6.515 9.21e-11 ***
## n21          2.25363    0.36182   6.229 5.75e-10 ***
## n22          2.29114    0.33994   6.740 2.08e-11 ***
## n23          1.94294    0.33639   5.776 8.88e-09 ***
## n24          2.42563    0.35719   6.791 1.47e-11 ***
## n25          1.98204    0.35585   5.570 2.90e-08 ***
## n26          2.16032    0.33883   6.376 2.26e-10 ***
## n27          2.35022    0.37638   6.244 5.21e-10 ***
## n28          2.07056    0.34846   5.942 3.32e-09 ***
## n29          2.03861    0.35614   5.724 1.20e-08 ***
## n30          2.27701    0.34654   6.571 6.40e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6266 on 1959 degrees of freedom
## Multiple R-squared:  0.9132, Adjusted R-squared:  0.9115 
## F-statistic: 515.5 on 40 and 1959 DF,  p-value: < 2.2e-16
dTrainB$pred <- predict(modelB,newdata=dTrainB)
print(paste('train rmse',rmse(dTrainB$pred,dTrainB[[yName]])))
## [1] "train rmse 0.620113653647223"
print(WVPlots::ScatterHist(dTrainB,'pred',yName,
                           paste('effects model train, smoothing=',sSigma),
                           smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).

## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name                 grob
## 1 1 (2-2,1-1) arrange       gtable[layout]
## 2 2 (2-2,2-2) arrange       gtable[layout]
## 3 3 (3-3,1-1) arrange       gtable[layout]
## 4 4 (3-3,2-2) arrange       gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1955]
dTestB$pred <- predict(modelB,newdata=dTestB)
print(paste('test rmse',rmse(dTestB$pred,dTestB[[yName]])))
## [1] "test rmse 1.6938494493661"
print(WVPlots::ScatterHist(dTestB,'pred',yName,
                           paste('effects model test, smoothing=',sSigma),
                           smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).

## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name                 grob
## 1 1 (2-2,1-1) arrange       gtable[layout]
## 2 2 (2-2,2-2) arrange       gtable[layout]
## 3 3 (3-3,1-1) arrange       gtable[layout]
## 4 4 (3-3,2-2) arrange       gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.2108]
if(!is.null(cl)) {
  parallel::stopCluster(cl)
  cl <- NULL
}